#!/usr/bin/env python

"""
Import Hadoop pipes/utils source code.

NOTE: starting from cdh4.3, there is a single Hadoop tarball with both
mr2 and mr1 code. The latter is located in:
${HADOOP_HOME}/src/hadoop-mapreduce1-project/. To fetch the code for
mrv1, run import_src ${HADOOP_HOME}/src/hadoop-mapreduce1-project; to
fetch the code for mrv2, run import_src ${HADOOP_HOME} --skip-dir
hadoop-mapreduce1-project.
"""

import sys, os, argparse, warnings, shutil


# Maps the basename of every file to import to the directory (relative, with
# "/" separators) it must be found in and will be copied to.  The table is
# spelled out grouped by destination directory.
WANTED = dict(
  (basename, location)
  for location, basenames in (
    ("utils/impl", ("StringUtils.cc", "SerialUtils.cc")),
    ("utils/api/hadoop", ("StringUtils.hh", "SerialUtils.hh")),
    ("pipes/impl", ("HadoopPipes.cc",)),
    ("pipes/api/hadoop", ("Pipes.hh", "TemplateFactory.hh")),
    ("libhdfs", (
      # all versions
      "hdfs.h",
      "hdfs.c",
      # old versions
      "hdfsJniHelper.h",
      "hdfsJniHelper.c",
      # recent versions
      "jni_helper.h",
      "jni_helper.c",
      "native_mini_dfs.h",
      "native_mini_dfs.c",
      "exception.h",
      "exception.c",
    )),
    # java pipes
    ("org/apache/hadoop/mapred/pipes", (
      "Application.java",
      "BinaryProtocol.java",
      "DownwardProtocol.java",
      "OutputHandler.java",
      "PipesMapRunner.java",
      "PipesNonJavaInputFormat.java",
      "PipesPartitioner.java",
      "PipesReducer.java",
      "Submitter.java",
      "UpwardProtocol.java",
    )),
    ("org/apache/hadoop/mapred", ("LocalJobRunner.java",)),
  )
  for basename in basenames
)


def _in_location(dirpath, location):
  """Tell whether ``dirpath`` ends with the path components of ``location``."""
  wanted_parts = location.split("/")
  dir_parts = os.path.normpath(dirpath).split(os.sep)
  return dir_parts[-len(wanted_parts):] == wanted_parts


def get_sources(root_dir, skip=None):
  """
  Locate the files listed in ``WANTED`` below ``root_dir``.

  A file is accepted only if its basename is a key of ``WANTED`` and the
  trailing path components of its directory are exactly the ones of the
  corresponding ``WANTED`` value.  If a file is found in several matching
  directories, the one visited last by ``os.walk`` overrides the others.

  :param root_dir: directory to search recursively.
  :param skip: if given, every directory whose path has a component equal
    to this name is ignored (including ``root_dir`` itself).
  :return: a dict mapping each found basename to its full path.

  A warning listing the basenames that were not found is issued, if any.
  """
  sources = {}
  for d, subdirs, basenames in os.walk(root_dir):
    if skip in d.split(os.sep):
      continue
    if skip is not None:
      # Prune in place so that os.walk does not even descend into skipped
      # directories (their whole subtree would be discarded anyway).
      subdirs[:] = [s for s in subdirs if s != skip]
    for bn in basenames:
      location = WANTED.get(bn)
      # Compare whole path components: a plain string suffix test would
      # also accept e.g. ".../notlibhdfs" for the "libhdfs" location.
      if location is not None and _in_location(d, location):
        sources[bn] = os.path.join(d, bn)
  missing = set(WANTED) - set(sources)
  if missing:
    warnings.warn("not found: %r" % (sorted(missing),))
  return sources


def make_parser():
  """
  Build the command line parser.

  The parser takes one positional argument (``hadoop_home``) and two
  optional ones, ``-o/--output-dir`` and ``-s/--skip-dir``; the module
  docstring is used as the program description.
  """
  cli = argparse.ArgumentParser(description=__doc__)
  cli.add_argument("hadoop_home", metavar="HADOOP_HOME")
  # Both options take a directory, so they only differ by flags and help.
  dir_options = (
    (("-o", "--output-dir"), "output directory"),
    (("-s", "--skip-dir"), "skip directories with this basename"),
  )
  for flags, help_text in dir_options:
    cli.add_argument(*flags, metavar="DIR", help=help_text)
  return cli


def main(argv):
  """
  Copy the wanted Hadoop source files to the output directory.

  :param argv: full argument vector; ``argv[0]`` (the program name) is
    ignored.

  If no output directory is given, files go to
  ``<parent of this script's directory>/src/<basename of HADOOP_HOME>``.
  Each file is copied to the sub-directory of the output directory given
  by its ``WANTED`` entry, and a ``source -> destination`` line is printed
  for every copy.
  """
  parser = make_parser()
  args = parser.parse_args(argv[1:])
  if not args.output_dir:
    this_dir = os.path.dirname(os.path.abspath(__file__))
    parent_dir = os.path.dirname(this_dir)
    args.output_dir = os.path.join(
      parent_dir, "src", os.path.basename(args.hadoop_home.rstrip("/"))
      )
  if args.skip_dir:
    # Strip trailing slashes before taking the basename: "some-dir/" would
    # otherwise yield "", which is a component of every absolute path and
    # would make get_sources skip everything.  An empty result means there
    # is nothing to skip.
    args.skip_dir = os.path.basename(args.skip_dir.rstrip("/")) or None
  sources = get_sources(args.hadoop_home, skip=args.skip_dir)
  for bn, p in sources.items():
    out_dir = os.path.join(args.output_dir, WANTED[bn])
    try:
      os.makedirs(out_dir)
    except OSError:
      # An already existing directory is fine; anything else (permissions,
      # a regular file in the way, ...) is a real error.
      if not os.path.isdir(out_dir):
        raise
    shutil.copy(p, out_dir)
    print("%s -> %s" % (p, out_dir))


# Run the import only when executed as a script.  The whole argument vector
# is passed on: main() itself drops the leading program name.
if __name__ == "__main__":
  main(sys.argv)
